{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import random\n",
    "import time\n",
    "import jsonlines\n",
    "import geopy\n",
    "import geopy.distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_data(file_path):\n",
    "    data_file = open(file_path)\n",
    "    data = []\n",
    "    for line in data_file:\n",
    "        data.append(json.loads(line))\n",
    "    data = pd.DataFrame(data)\n",
    "    data_file.close()\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question11_gen(data, num_questions_per_template, questions, question_id, categories):\n",
    "    # How many {category} businesses are there in {city}, {state}?\n",
    "    city_list = data[\"city\"].unique()\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        city = random.choice(city_list)\n",
    "        cat = random.choice(categories)\n",
    "        sub_data = data[(data[\"city\"] == city) & (data[\"categories\"].str.contains(cat))]\n",
    "        if len(sub_data) != 0:\n",
    "            question = \"How many {} businesses are there in {}, {}?\".format(cat, city, data[data[\"city\"] == city][\"state\"].iloc[0])\n",
    "            answer = len(sub_data)\n",
    "            questions.append({\"qid\": \"hard-yelp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question12_gen(data, num_questions_per_template, questions, question_id, categories):\n",
    "    # How many bussinesses are there in {postal_code} area of {city}, {state}?\n",
    "    postalcode_list = data[\"postal_code\"].unique()\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        postalcode = random.choice(postalcode_list)\n",
    "        sub_data = data[data[\"postal_code\"] == postalcode]\n",
    "        if len(sub_data) != 0:\n",
    "            question = \"How many businesses are there in {} area of {}, {}?\".format(postalcode, sub_data[\"city\"].iloc[0], sub_data[\"state\"].iloc[0])\n",
    "            answer = len(sub_data)\n",
    "            questions.append({\"qid\": \"hard-yelp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question13_gen(data, num_questions_per_template, questions, question_id, categories):\n",
    "    # Which {category} business has the highest star rating in {city}, {state}?\n",
    "    city_list = data[\"city\"].unique()\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        city = random.choice(city_list)\n",
    "        cat = random.choice(categories)\n",
    "        sub_data = data[(data[\"city\"] == city) & (data[\"categories\"].str.contains(cat))]\n",
    "        if len(sub_data) != 0:\n",
    "            question = \"Which {} business has the highest star rating in {}, {}?\".format(cat, city, data[data[\"city\"] == city][\"state\"].iloc[0])\n",
    "            answer = sub_data[sub_data[\"stars\"] == max(sub_data[\"stars\"])][\"name\"].iloc[0]\n",
    "            questions.append({\"qid\": \"hard-yelp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question14_gen(data, num_questions_per_template, questions, question_id, categories):\n",
    "    # Which {category} business has the highest review count in {city}, {state}?\n",
    "    city_list = data[\"city\"].unique()\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        city = random.choice(city_list)\n",
    "        cat = random.choice(categories)\n",
    "        sub_data = data[(data[\"city\"] == city) & (data[\"categories\"].str.contains(cat))]\n",
    "        if len(sub_data) != 0:\n",
    "            question = \"Which {} business has the highest review count in {}, {}?\".format(cat, city, data[data[\"city\"] == city][\"state\"].iloc[0])\n",
    "            answer = sub_data[sub_data[\"review_count\"] == max(sub_data[\"review_count\"])][\"name\"].iloc[0]\n",
    "            questions.append({\"qid\": \"hard-yelp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question15_gen(data, num_questions_per_template, questions, question_id, categories):\n",
    "    # What is the average review counts of businesses within a 5-mile radius from {name}?\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        random_indices = random.randint(0, len(data) - 1)\n",
    "        selected_data = data.iloc[random_indices]\n",
    "        latitude = selected_data[\"latitude\"]\n",
    "        longitude = selected_data[\"longitude\"]\n",
    "        _, lo_max, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=90)\n",
    "        _, lo_min, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=270)\n",
    "        la_max, _, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=0)\n",
    "        la_min, _, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=180)\n",
    "        sub_data = data[(data[\"latitude\"] <= la_max) & (data[\"latitude\"] >= la_min) & (data[\"longitude\"] <= lo_max) & (data[\"longitude\"] >= lo_min)]\n",
    "        if len(sub_data) != 0:\n",
    "            question = \"What is the average review counts of businesses within a 5-mile radius from {}?\".format(selected_data[\"name\"])\n",
    "            answer = round(sub_data[\"review_count\"].mean(), 2)\n",
    "            questions.append({\"qid\": \"hard-yelp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question16_gen(data, num_questions_per_template, questions, question_id, categories):\n",
    "    # Which is the nearest {category} business to {name}?\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        random_indices = random.randint(0, len(data) - 1)\n",
    "        selected_data = data.iloc[random_indices]\n",
    "        city = selected_data[\"city\"]\n",
    "        try:\n",
    "            cat = random.choice(categories)\n",
    "            sub_data = data[(data[\"city\"] == city) & (data[\"categories\"].str.contains(cat))]\n",
    "            if len(sub_data) != 0:\n",
    "                question = \"Which is the nearest {} business to {}?\".format(cat, selected_data[\"name\"])\n",
    "                answer = sub_data[(sub_data[\"latitude\"] - selected_data[\"latitude\"])**2 + (sub_data[\"longitude\"] - selected_data[\"longitude\"])**2 == min((sub_data[\"latitude\"] - selected_data[\"latitude\"])**2 + (sub_data[\"longitude\"] - selected_data[\"longitude\"])**2)][\"name\"].iloc[0]\n",
    "                questions.append({\"qid\": \"hard-yelp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "                question_id += 1\n",
    "        except:\n",
    "            print(cat)\n",
    "            input()\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question17_gen(data, num_questions_per_template, questions, question_id, categories):\n",
    "    # Can you recommend a {category} business with the highest star rating within a 5-mile radius of {address}?\n",
    "    target = question_id + num_questions_per_template\n",
    "    while question_id < target:\n",
    "        random_indices = random.randint(0, len(data) - 1)\n",
    "        selected_data = data.iloc[random_indices]\n",
    "        address = selected_data[\"address\"]\n",
    "        latitude = selected_data[\"latitude\"]\n",
    "        longitude = selected_data[\"longitude\"]\n",
    "        city = selected_data[\"city\"]\n",
    "        cat = random.choice(categories)\n",
    "        sub_data = data[(data[\"city\"] == city) & (data[\"categories\"].str.contains(cat))]\n",
    "        _, lo_max, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=90)\n",
    "        _, lo_min, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=270)\n",
    "        la_max, _, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=0)\n",
    "        la_min, _, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=180)\n",
    "        sub_data = data[(data[\"latitude\"] <= la_max) & (data[\"latitude\"] >= la_min) & (data[\"longitude\"] <= lo_max) & (data[\"longitude\"] >= lo_min)]\n",
    "        if len(sub_data) != 0:\n",
    "            question = \"Can you recommend a {} business with the highest star rating within a 5-mile radius of {}?\".format(cat, address)\n",
    "            answer = sub_data[sub_data[\"stars\"] == max(sub_data[\"stars\"])][\"name\"].iloc[0]\n",
    "            questions.append({\"qid\": \"hard-yelp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question18_gen(data, num_questions_per_template, questions, question_id, categories):\n",
    "    # How many businesses are not open currently in {city}?\n",
    "    target = question_id + num_questions_per_template\n",
    "    city_list = data[\"city\"].unique()\n",
    "    while question_id < target:\n",
    "        city = random.choice(city_list)\n",
    "        sub_data = data[data[\"city\"] == city]\n",
    "        if len(sub_data) != 0:\n",
    "            question = \"How many businesses are not open currently in {}?\".format(city)\n",
    "            answer = len(sub_data[sub_data[\"is_open\"] == 0])\n",
    "            questions.append({\"qid\": \"hard-yelp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question19_gen(data, num_questions_per_template, questions, question_id, categories):\n",
    "    # What is the average star rating of {category} businesses in {city}?\n",
    "    target = question_id + num_questions_per_template\n",
    "    city_list = data[\"city\"].unique()\n",
    "    while question_id < target:\n",
    "        city = random.choice(city_list)\n",
    "        cat = random.choice(categories)\n",
    "        sub_data = data[(data[\"city\"] == city) & (data[\"categories\"].str.contains(cat))]\n",
    "        if len(sub_data) != 0:\n",
    "            question = \"What is the average star rating of {} businesses in {}?\".format(cat, city)\n",
    "            answer = round(sub_data[\"stars\"].mean(), 2)\n",
    "            questions.append({\"qid\": \"hard-yelp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def question20_gen(data, num_questions_per_template, questions, question_id, categories):\n",
    "    # Which region has most bussinesses in {city}, {state}?\n",
    "    target = question_id + num_questions_per_template\n",
    "    city_list = data[\"city\"].unique()\n",
    "    while question_id < target:\n",
    "        city = random.choice(city_list)\n",
    "        state = data[data[\"city\"] == city][\"state\"].iloc[0]\n",
    "        sub_data = data[(data[\"city\"] == city) & (data[\"state\"] == state)]\n",
    "        if len(sub_data) != 0:\n",
    "            question = \"Which postal code region has most bussinesses in {}, {}?\".format(city, state)\n",
    "            answer = sub_data[\"postal_code\"].value_counts().index[0]\n",
    "            questions.append({\"qid\": \"hard-yelp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "            question_id += 1\n",
    "    return questions, question_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_time = time.time()\n",
    "file_path = \"/<YOUR_OWN_PATH>/ToolQA/data/raw_data/yelp/yelp_academic_dataset_business.json\"\n",
    "data = read_data(file_path)\n",
    "num_questions_per_template = 10\n",
    "question_id = 0\n",
    "questions = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories_list = data[\"categories\"].unique()\n",
    "categories = []\n",
    "for cat in categories_list:\n",
    "    if cat != None:\n",
    "        categories += cat.split(\", \")\n",
    "categories = sorted(list(set(categories)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_108821/3954776033.py:8: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
      "  sub_data = data[(data[\"city\"] == city) & (data[\"categories\"].str.contains(cat))]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-yelp-0000', 'question': 'How many Home Organization businesses are there in Franklin, TN?', 'answer': 4}, {'qid': 'hard-yelp-0001', 'question': 'How many Hair Salons businesses are there in Trooper, PA?', 'answer': 1}, {'qid': 'hard-yelp-0002', 'question': 'How many Tours businesses are there in Wayne, PA?', 'answer': 1}, {'qid': 'hard-yelp-0003', 'question': 'How many Southern businesses are there in Glenolden, PA?', 'answer': 1}, {'qid': 'hard-yelp-0004', 'question': 'How many Brewpubs businesses are there in Lumberton, NJ?', 'answer': 1}, {'qid': 'hard-yelp-0005', 'question': 'How many Wedding Planning businesses are there in Cinnaminson, NJ?', 'answer': 1}, {'qid': 'hard-yelp-0006', 'question': 'How many Florists businesses are there in Lansdale, PA?', 'answer': 5}, {'qid': 'hard-yelp-0007', 'question': 'How many Wholesale Stores businesses are there in Boise, ID?', 'answer': 7}, {'qid': 'hard-yelp-0008', 'question': 'How many Discount Store businesses are there in Speedway, IN?', 'answer': 1}, {'qid': 'hard-yelp-0009', 'question': 'How many Auto Glass Services businesses are there in Saint Petersburg, FL?', 'answer': 5}]\n",
      "Time elapsed for Question 11: 205.68362951278687 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 11\n",
    "questions, question_id = question11_gen(data, num_questions_per_template, questions, question_id, categories)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 11: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-yelp-0010', 'question': 'How many businesses are there in 18917 area of Dublin, PA?', 'answer': 25}, {'qid': 'hard-yelp-0011', 'question': 'How many businesses are there in T5L 2L4 area of Edmonton, AB?', 'answer': 1}, {'qid': 'hard-yelp-0012', 'question': 'How many businesses are there in 19120 area of Philadelphia, PA?', 'answer': 161}, {'qid': 'hard-yelp-0013', 'question': 'How many businesses are there in T6H 1M2 area of Edmonton, AB?', 'answer': 1}, {'qid': 'hard-yelp-0014', 'question': 'How many businesses are there in T6V 1B1 area of Edmonton, AB?', 'answer': 5}, {'qid': 'hard-yelp-0015', 'question': 'How many businesses are there in T5N 1R1 area of Edmonton, AB?', 'answer': 3}, {'qid': 'hard-yelp-0016', 'question': 'How many businesses are there in T5Y 2W7 area of Edmonton, AB?', 'answer': 5}, {'qid': 'hard-yelp-0017', 'question': 'How many businesses are there in T8H 2A2 area of Sherwood Park, AB?', 'answer': 3}, {'qid': 'hard-yelp-0018', 'question': 'How many businesses are there in 08026 area of Gibbsboro, NJ?', 'answer': 18}, {'qid': 'hard-yelp-0019', 'question': 'How many businesses are there in T5P 2S1 area of Edmonton, AB?', 'answer': 1}]\n",
      "Time elapsed for Question 12: 215.01229214668274 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 12\n",
    "questions, question_id = question12_gen(data, num_questions_per_template, questions, question_id, categories)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 12: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-yelp-0020', 'question': 'Which Dentists business has the highest star rating in Wenonah, NJ?', 'answer': 'Nester & Wyckoff General Dentistry'}, {'qid': 'hard-yelp-0021', 'question': 'Which Sandwiches business has the highest star rating in Garnet Valley, PA?', 'answer': 'Lancaster County Sausage'}, {'qid': 'hard-yelp-0022', 'question': 'Which Restaurants business has the highest star rating in Affton, MO?', 'answer': 'Sushi Hana'}, {'qid': 'hard-yelp-0023', 'question': 'Which Dry Cleaning business has the highest star rating in Sun City Center, FL?', 'answer': 'Amazing Cleaners'}, {'qid': 'hard-yelp-0024', 'question': 'Which French business has the highest star rating in Indian Rocks Beach, FL?', 'answer': 'Cafe de Paris Bakery'}, {'qid': 'hard-yelp-0025', 'question': 'Which ATV Rentals/Tours business has the highest star rating in Sparks, NV?', 'answer': 'SWARMFIRE'}, {'qid': 'hard-yelp-0026', 'question': 'Which Videos & Video Game Rental business has the highest star rating in Nashville, TN?', 'answer': 'Gamestop'}, {'qid': 'hard-yelp-0027', 'question': 'Which Periodontists business has the highest star rating in St Louis, MO?', 'answer': 'Nikodem Dental'}, {'qid': 'hard-yelp-0028', 'question': 'Which Formal Wear business has the highest star rating in Deptford, NJ?', 'answer': \"Men's Wearhouse\"}, {'qid': 'hard-yelp-0029', 'question': 'Which Bars business has the highest star rating in Magnolia, NJ?', 'answer': 'The Laughing Fox Tavern'}]\n",
      "Time elapsed for Question 13: 234.02396774291992 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 13\n",
    "questions, question_id = question13_gen(data, num_questions_per_template, questions, question_id, categories)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 13: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_108821/629839948.py:8: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
      "  sub_data = data[(data[\"city\"] == city) & (data[\"categories\"].str.contains(cat))]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-yelp-0030', 'question': 'Which Condominiums business has the highest review count in Tampa, FL?', 'answer': 'The Place At Channelside'}, {'qid': 'hard-yelp-0031', 'question': 'Which Imported Food business has the highest review count in Santa Barbara, CA?', 'answer': 'El Sitio'}, {'qid': 'hard-yelp-0032', 'question': 'Which Windshield Installation & Repair business has the highest review count in Lutz, FL?', 'answer': 'Gerber Collision & Glass'}, {'qid': 'hard-yelp-0033', 'question': 'Which Diagnostic Imaging business has the highest review count in New Orleans, LA?', 'answer': 'All American Healthcare New Orleans'}, {'qid': 'hard-yelp-0034', 'question': 'Which Restaurants business has the highest review count in Fortville, IN?', 'answer': \"Simeri's Italian\"}, {'qid': 'hard-yelp-0035', 'question': 'Which Bridal business has the highest review count in Ardmore, PA?', 'answer': 'Bijou Bridal & Special Occasion Ardmore'}, {'qid': 'hard-yelp-0036', 'question': 'Which Pet Stores business has the highest review count in Oakville, MO?', 'answer': 'Treats Unleashed'}, {'qid': 'hard-yelp-0037', 'question': 'Which Movers business has the highest review count in Cherry Hill, NJ?', 'answer': 'All My Sons Moving & Storage'}, {'qid': 'hard-yelp-0038', 'question': 'Which Sushi Bars business has the highest review count in Carpinteria, CA?', 'answer': 'Sushi Teri'}, {'qid': 'hard-yelp-0039', 'question': 'Which Periodontists business has the highest review count in Fairless Hills, PA?', 'answer': 'USA Dental Care'}]\n",
      "Time elapsed for Question 14: 246.4704670906067 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 14\n",
    "questions, question_id = question14_gen(data, num_questions_per_template, questions, question_id, categories)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 14: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-yelp-0040', 'question': 'What is the average review counts of businesses within a 5-mile radius from Uptown Delivery Pharmacy?', 'answer': 101.2}, {'qid': 'hard-yelp-0041', 'question': 'What is the average review counts of businesses within a 5-mile radius from Residence Inn Tampa Suncoast Parkway at NorthPointe Village?', 'answer': 27.21}, {'qid': 'hard-yelp-0042', 'question': \"What is the average review counts of businesses within a 5-mile radius from Kohl's?\", 'answer': 50.91}, {'qid': 'hard-yelp-0043', 'question': 'What is the average review counts of businesses within a 5-mile radius from Massage Heights?', 'answer': 16.17}, {'qid': 'hard-yelp-0044', 'question': 'What is the average review counts of businesses within a 5-mile radius from Sublime Yoga and Wellness?', 'answer': 56.52}, {'qid': 'hard-yelp-0045', 'question': 'What is the average review counts of businesses within a 5-mile radius from Duvall Flooring?', 'answer': 41.82}, {'qid': 'hard-yelp-0046', 'question': 'What is the average review counts of businesses within a 5-mile radius from Bala Institute of Oral Surgery?', 'answer': 41.37}, {'qid': 'hard-yelp-0047', 'question': 'What is the average review counts of businesses within a 5-mile radius from Paper Source?', 'answer': 78.81}, {'qid': 'hard-yelp-0048', 'question': \"What is the average review counts of businesses within a 5-mile radius from Maro Brothers' Discount Liquor Mart?\", 'answer': 23.25}, {'qid': 'hard-yelp-0049', 'question': 'What is the average review counts of businesses within a 5-mile radius from Studio 210 Salon?', 'answer': 38.56}]\n",
      "Time elapsed for Question 15: 246.5169322490692 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 15\n",
    "questions, question_id = question15_gen(data, num_questions_per_template, questions, question_id, categories)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 15: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-yelp-0050', 'question': 'Which is the nearest Chinese business to Sugar Mamas Custom Cakes?', 'answer': 'Hong Kong Restaurant'}, {'qid': 'hard-yelp-0051', 'question': 'Which is the nearest Cooking Classes business to 5-Star Staffing Solutions?', 'answer': 'Sur La Table'}, {'qid': 'hard-yelp-0052', 'question': 'Which is the nearest Orthopedists business to TNS Diamonds?', 'answer': \"Benjamin's On the Row\"}, {'qid': 'hard-yelp-0053', 'question': 'Which is the nearest Immigration Law business to MYNT Cannabis Dispensary Downtown Reno?', 'answer': 'Law Office Of Mark Mausert'}, {'qid': 'hard-yelp-0054', 'question': 'Which is the nearest Himalayan/Nepalese business to Roller City?', 'answer': 'Himalaya Kabob Korner'}, {'qid': 'hard-yelp-0055', 'question': 'Which is the nearest Weight Loss Centers business to Everingham Elecrtic?', 'answer': 'Fresh Vitality Medical Spa & Center for Health'}, {'qid': 'hard-yelp-0056', 'question': 'Which is the nearest Bangladeshi business to Harcourts NV1?', 'answer': 'Dawat Fast Food & Restaurant'}, {'qid': 'hard-yelp-0057', 'question': \"Which is the nearest Cuban business to Moe's Southwest Grill?\", 'answer': \"Captain Bill's\"}, {'qid': 'hard-yelp-0058', 'question': 'Which is the nearest Preschools business to Cumberland Transit?', 'answer': 'Primrose School of Nashville Midtown'}, {'qid': 'hard-yelp-0059', 'question': 'Which is the nearest Divorce & Family Law business to Clearwater Dental Associates?', 'answer': 'Carey Leisure & Neal'}]\n",
      "Time elapsed for Question 16: 249.02199578285217 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 16\n",
    "questions, question_id = question16_gen(data, num_questions_per_template, questions, question_id, categories)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 16: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-yelp-0060', 'question': 'Can you recommend a Art Supplies business with the highest star rating within a 5-mile radius of 615 Channelside Dr?', 'answer': 'Thach Used Tires'}, {'qid': 'hard-yelp-0061', 'question': 'Can you recommend a Rock Climbing business with the highest star rating within a 5-mile radius of 2428 Nolensville Rd?', 'answer': 'barre3 Nashville - The Gulch'}, {'qid': 'hard-yelp-0062', 'question': 'Can you recommend a Salvadoran business with the highest star rating within a 5-mile radius of 540 Simpson Dr?', 'answer': 'Monkey Fish Toys'}, {'qid': 'hard-yelp-0063', 'question': 'Can you recommend a General Festivals business with the highest star rating within a 5-mile radius of 719 Toulouse St?', 'answer': 'Ann Becnel Companion Dogs'}, {'qid': 'hard-yelp-0064', 'question': 'Can you recommend a Pet Waste Removal business with the highest star rating within a 5-mile radius of 598 Sam Ridley Pkwy W?', 'answer': \"WilkerSon's Heating & Cooling\"}, {'qid': 'hard-yelp-0065', 'question': 'Can you recommend a Title Loans business with the highest star rating within a 5-mile radius of 75 Queensway Dr?', 'answer': 'Native Roots Landscaping'}, {'qid': 'hard-yelp-0066', 'question': 'Can you recommend a Belgian business with the highest star rating within a 5-mile radius of 5930 Daley St?', 'answer': 'Isla Vista Community Bike Center'}, {'qid': 'hard-yelp-0067', 'question': 'Can you recommend a Notaries business with the highest star rating within a 5-mile radius of 101 International Dr, Ste 106?', 'answer': 'Staybridge Suites - Franklin'}, {'qid': 'hard-yelp-0068', 'question': 'Can you recommend a Real Estate Agents business with the highest star rating within a 5-mile radius of ?', 'answer': 'So Fly Studio'}, {'qid': 'hard-yelp-0069', 'question': 'Can you recommend a Pediatricians business with the highest star rating within a 5-mile radius of 2 Engineer Rd?', 'answer': 'Darin Chiropractic'}]\n",
      "Time elapsed for Question 17: 250.07569074630737 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 17\n",
    "questions, question_id = question17_gen(data, num_questions_per_template, questions, question_id, categories)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 17: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-yelp-0070', 'question': 'How many businesses are not open currently in Cottage Hills?', 'answer': 0}, {'qid': 'hard-yelp-0071', 'question': 'How many businesses are not open currently in Apollo beach?', 'answer': 0}, {'qid': 'hard-yelp-0072', 'question': 'How many businesses are not open currently in West Norriton ?', 'answer': 0}, {'qid': 'hard-yelp-0073', 'question': 'How many businesses are not open currently in Tarpon springs?', 'answer': 0}, {'qid': 'hard-yelp-0074', 'question': 'How many businesses are not open currently in Mt Laurel Township?', 'answer': 0}, {'qid': 'hard-yelp-0075', 'question': 'How many businesses are not open currently in Glen Carbon?', 'answer': 3}, {'qid': 'hard-yelp-0076', 'question': 'How many businesses are not open currently in Washington Township?', 'answer': 2}, {'qid': 'hard-yelp-0077', 'question': 'How many businesses are not open currently in McCordsville?', 'answer': 0}, {'qid': 'hard-yelp-0078', 'question': 'How many businesses are not open currently in Franconia?', 'answer': 0}, {'qid': 'hard-yelp-0079', 'question': 'How many businesses are not open currently in Maryland Heights?', 'answer': 37}]\n",
      "Time elapsed for Question 18: 250.36629509925842 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 18\n",
    "questions, question_id = question18_gen(data, num_questions_per_template, questions, question_id, categories)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 18: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-yelp-0080', 'question': 'What is the average star rating of Limos businesses in St. Ann?', 'answer': 5.0}, {'qid': 'hard-yelp-0081', 'question': 'What is the average star rating of Television Service Providers businesses in Saint Petersburg?', 'answer': 1.5}, {'qid': 'hard-yelp-0082', 'question': 'What is the average star rating of Body Shops businesses in Brentwood?', 'answer': 3.12}, {'qid': 'hard-yelp-0083', 'question': 'What is the average star rating of Asian Fusion businesses in Ladue?', 'answer': 3.0}, {'qid': 'hard-yelp-0084', 'question': 'What is the average star rating of Pet Groomers businesses in Hermitage?', 'answer': 2.83}, {'qid': 'hard-yelp-0085', 'question': 'What is the average star rating of Car Buyers businesses in Woodbury?', 'answer': 2.0}, {'qid': 'hard-yelp-0086', 'question': 'What is the average star rating of Local Services businesses in Mount Holly?', 'answer': 3.4}, {'qid': 'hard-yelp-0087', 'question': 'What is the average star rating of Arts & Entertainment businesses in Florissant?', 'answer': 2.5}, {'qid': 'hard-yelp-0088', 'question': 'What is the average star rating of Structural Engineers businesses in Philadelphia?', 'answer': 1.0}, {'qid': 'hard-yelp-0089', 'question': 'What is the average star rating of Fitness & Instruction businesses in Ladue?', 'answer': 4.5}]\n",
      "Time elapsed for Question 19: 269.6077845096588 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 19\n",
    "questions, question_id = question19_gen(data, num_questions_per_template, questions, question_id, categories)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 19: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-yelp-0090', 'question': 'Which region has most bussinesses in Port Richey, FL?', 'answer': '34668'}, {'qid': 'hard-yelp-0091', 'question': 'Which region has most bussinesses in Spring House, PA?', 'answer': '19477'}, {'qid': 'hard-yelp-0092', 'question': 'Which region has most bussinesses in Mission Canyon, CA?', 'answer': '93105'}, {'qid': 'hard-yelp-0093', 'question': 'Which region has most bussinesses in Goodlettsville, TN?', 'answer': '37072'}, {'qid': 'hard-yelp-0094', 'question': 'Which region has most bussinesses in Upper Gwynedd, PA?', 'answer': '19446'}, {'qid': 'hard-yelp-0095', 'question': 'Which region has most bussinesses in Webster Grvs, MO?', 'answer': '63119'}, {'qid': 'hard-yelp-0096', 'question': 'Which region has most bussinesses in Schwenksville, PA?', 'answer': '19473'}, {'qid': 'hard-yelp-0097', 'question': 'Which region has most bussinesses in Brentwood, MO?', 'answer': '63144'}, {'qid': 'hard-yelp-0098', 'question': 'Which region has most bussinesses in Cheltenham Township, PA?', 'answer': '19027'}, {'qid': 'hard-yelp-0099', 'question': 'Which region has most bussinesses in Austin, TX?', 'answer': '78752'}]\n",
      "Time elapsed for Question 20: 270.4222888946533 seconds\n"
     ]
    }
   ],
   "source": [
    "# question template 20\n",
    "questions, question_id = question20_gen(data, num_questions_per_template, questions, question_id, categories)\n",
    "end_time = time.time()\n",
    "print(questions[-10:])\n",
    "print(\"Time elapsed for Question 20: {} seconds\".format(end_time - start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "with jsonlines.open('/<YOUR_OWN_PATH>/ToolQA/data/questions/hard/yelp-hard.jsonl', mode='w') as writer:\n",
    "    for row in questions:\n",
    "        writer.write(row)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
